library(rvest)
library(tidyverse)
url <-"html_top100.txt"
college_urls <- url %>%
read_html() %>%
html_node("body") %>% html_nodes("ol[class~=bEyEue]") %>% html_nodes("li[id]")%>% html_nodes("h3") %>%
html_nodes("a[href]") %>%
html_attr("href")
head(college_urls)
## [1] "/best-colleges/princeton-university-2627"
## [2] "/best-colleges/harvard-university-2155"
## [3] "/best-colleges/columbia-university-2707"
## [4] "/best-colleges/massachusetts-institute-of-technology-2178"
## [5] "/best-colleges/university-of-chicago-1774"
## [6] "/best-colleges/yale-university-1426"
index_num <- 0
college_tab_1 <- data.frame("URL" = gsub(" ", "", paste("https://www.usnews.com",college_urls, sep = "")),
"CollegeName"= "", "TuitionFeesThousands" = 0, "RoomBoardThousands" = 0, "TotalEnrollment" = 0, "SchoolType" = "", "YearFounded" = 0, "Setting" = "", "Endowment2017Millions" = 0, "MedianStartingSalaryOfAlumniThousands" = 0, "Selectivity" = "", "Fall2017AcceptanceRate" = 0, "MalePercentage" = 0, "FourYearGraduationRate" = 0, stringsAsFactors = FALSE)
#removing one college that doesn't have a median starting salary, for data uniformity
college_tab_1 <- college_tab_1[-c(40),]
head(college_tab_1)
## URL
## 1 https://www.usnews.com/best-colleges/princeton-university-2627
## 2 https://www.usnews.com/best-colleges/harvard-university-2155
## 3 https://www.usnews.com/best-colleges/columbia-university-2707
## 4 https://www.usnews.com/best-colleges/massachusetts-institute-of-technology-2178
## 5 https://www.usnews.com/best-colleges/university-of-chicago-1774
## 6 https://www.usnews.com/best-colleges/yale-university-1426
## CollegeName TuitionFeesThousands RoomBoardThousands TotalEnrollment
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## SchoolType YearFounded Setting Endowment2017Millions
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## MedianStartingSalaryOfAlumniThousands Selectivity Fall2017AcceptanceRate
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## MalePercentage FourYearGraduationRate
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
Below are functions used to obtain data from the website and parse it.
#retrieves of vector of size three containing the Tuition&Fees, Room&Board, and total enrollment
get_info <- function(url_html){
attr <- url_html %>% html_node("body") %>% html_nodes("div[id~=content-main]") %>%
html_nodes("section[class~=hero-stats-widget-stats]") %>%
html_nodes("ul") %>% html_nodes("li") %>% html_nodes("strong")
}
#takes in a vector and index, and parses that information to a double
#ex: $47,263 -> 47263.0
get_tuition_rm <- function(info, num){
a_1 <- info[num] %>% html_text()
tuition_rm <-
as.double(paste(substring(a_1, 2, str_locate(a_1, ",")[1] - 1), substring(a_1, str_locate(a_1, ",")[1] + 1, str_locate(a_1, " ")[1] - 1), sep=""))
tuition_rm / 1000.0
}
#takes in a vector and parses the total enrollment information to a double
get_enrollment <- function(info){
a_1 <- info[3] %>% html_text()
as.double(paste(substring(a_1, 1, str_locate(a_1, ",")[1] - 1), substring(a_1, str_locate(a_1, ",")[1] + 1), sep=""))
}
#gets the percentage of the majority gender at a certain university
get_percent <- function(url_html){
attr <- url_html %>% html_node("body") %>% html_nodes("div[id~=content-main]") %>%
html_nodes("div[class~=block-normal]") %>% html_nodes("span[class~=distribution-breakdown__percentage]") %>% html_text()
as.double(substring(attr, 1, str_locate(attr, "%")[1] - 1)) / 100.0
}
#retrieves the gender of the majority sex and parses the percentage to be in terms of males
get_gender_ratio <- function(url_html){
attr <- url_html %>% html_node("body") %>% html_nodes("div[id~=content-main]") %>%
html_nodes("div[class~=block-normal]") %>% html_nodes("span[class~=distribution-breakdown__percentage-copy]") %>% html_text()
attr <- sub("\n ","",attr)
attr <- sub("\n ","",attr)
if (attr == "Female"){
1 - get_percent(url_html)
}else{
get_percent(url_html)
}
}
Here, we use both the functions above and the html_node function to fill out the table.
college_tab <- college_tab_1
for (i in 1:nrow(college_tab)){
url_html <- college_tab[i,1] %>%read_html()
college_tab[i,]$CollegeName <- url_html %>% html_node("body") %>% html_nodes("h1[class~=hero-heading]") %>% html_text()
priv_tuition <- url_html %>% html_node("body") %>% html_nodes("span[data-test-id~=v_private_tuition]") %>% html_text()
college_tab[i,]$TuitionFeesThousands <- ifelse(length(priv_tuition) > 0, priv_tuition,
url_html %>% html_node("body") %>% html_node("span[data-test-id~=v_out_state_tuition]") %>% html_text())
college_tab[i,]$RoomBoardThousands <- url_html %>% html_node("body") %>% html_node("span[data-test-id~=w_room_board]") %>% html_text()
college_tab[i,]$TotalEnrollment <- url_html %>% html_node("body") %>% html_node("span[data-test-id~=total_all_students]") %>% html_text()
college_tab[i,]$MalePercentage <- get_gender_ratio(url_html)
college_tab[i,]$Fall2017AcceptanceRate <- url_html %>% html_node("span[data-test-id~=r_c_accept_rate]") %>% html_text()
college_tab[i,]$Selectivity <- url_html %>% html_node("span[data-test-id~=c_select_class]") %>% html_text()
college_tab[i,]$FourYearGraduationRate <- url_html %>% html_node("span[data-test-id~=grad_rate_4_year]") %>% html_text()
college_tab[i,]$MedianStartingSalaryOfAlumniThousands <- url_html %>% html_nodes("div[data-field-id=averageStartSalary]") %>%html_node("span[data-test-id]") %>% html_text()
temp_vector <- url_html %>% html_node("body") %>% html_nodes("div[id~=content-main]") %>%html_nodes("div[class~=flex-row]") %>% html_nodes("span[class~=heading-small]") %>% html_text()
college_tab[i,]$SchoolType <- temp_vector[1]
college_tab[i,]$YearFounded <- temp_vector[2]
college_tab[i,]$Setting <- temp_vector[5]
college_tab[i,]$Endowment2017Millions <- temp_vector[6]
}
head(college_tab)
## URL
## 1 https://www.usnews.com/best-colleges/princeton-university-2627
## 2 https://www.usnews.com/best-colleges/harvard-university-2155
## 3 https://www.usnews.com/best-colleges/columbia-university-2707
## 4 https://www.usnews.com/best-colleges/massachusetts-institute-of-technology-2178
## 5 https://www.usnews.com/best-colleges/university-of-chicago-1774
## 6 https://www.usnews.com/best-colleges/yale-university-1426
## CollegeName
## 1 \n Princeton University\n
## 2 \n Harvard University\n
## 3 \n Columbia University\n
## 4 \n Massachusetts Institute of Technology\n
## 5 \n University of Chicago\n
## 6 \n Yale University\n
## TuitionFeesThousands RoomBoardThousands
## 1 \n $47,140 (2018-19) \n $15,610 (2018-19)
## 2 \n $50,420 (2018-19) \n $17,160 (2018-19)
## 3 \n $59,430 (2018-19) \n $14,016 (2018-19)
## 4 \n $51,832 (2018-19) \n $15,510 (2018-19)
## 5 \n $57,006 (2018-19) \n $16,350 (2018-19)
## 6 \n $53,430 (2018-19) \n $16,000 (2018-19)
## TotalEnrollment SchoolType YearFounded Setting
## 1 \n 8,273 Private, Coed 1746 Suburban
## 2 \n 20,604 Private, Coed 1636 Urban
## 3 \n 25,968 Private, Coed 1754 Urban
## 4 \n 11,466 Private, Coed 1861 Urban
## 5 \n 13,736 Private, Coed 1890 Urban
## 6 \n 12,974 Private, Coed 1701 City
## Endowment2017Millions MedianStartingSalaryOfAlumniThousands
## 1 $23.4 billion \n $68,400*
## 2 $37.1 billion \n $66,500*
## 3 $10.0 billion \n $64,900*
## 4 $14.8 billion + \n $79,800*
## 5 $6.6 billion + \n $57,700*
## 6 $27.2 billion + \n $63,200*
## Selectivity Fall2017AcceptanceRate MalePercentage
## 1 \n Most selective \n 6% 0.51
## 2 \n Most selective \n 5% 0.52
## 3 \n Most selective \n 6% 0.52
## 4 \n Most selective \n 7% 0.54
## 5 \n Most selective \n 9% 0.51
## 6 \n Most selective \n 7% 0.50
## FourYearGraduationRate
## 1 \n 89%
## 2 \n 84%
## 3 \n 88%
## 4 \n 85%
## 5 \n 88%
## 6 \n 87%
Below, we reformat many of the columns to get usable data.
formatted_college_tab <- college_tab
#fix type of School Type, Setting, Year Founded
formatted_college_tab$SchoolType <- as.factor(formatted_college_tab$SchoolType)
formatted_college_tab$Setting <- as.factor(formatted_college_tab$Setting)
formatted_college_tab$YearFounded <- as.integer(formatted_college_tab$YearFounded)
#fix Endowment2017 formatting
formatted_college_tab$Endowment2017Millions <- ifelse(grepl("billion", formatted_college_tab$Endowment2017Millions ), sub("\\.","",formatted_college_tab$Endowment2017Millions ),formatted_college_tab$Endowment2017Millions )
formatted_college_tab$Endowment2017Millions <-sub(" billion","00",formatted_college_tab$Endowment2017Millions )
formatted_college_tab$Endowment2017Millions <-sub(" million","",formatted_college_tab$Endowment2017Millions )
formatted_college_tab$Endowment2017Millions <-sub("[[:punct:]]", "",formatted_college_tab$Endowment2017Millions )
formatted_college_tab$Endowment2017Millions <-sub("\\$", "",formatted_college_tab$Endowment2017Millions )
formatted_college_tab$Endowment2017Millions <-sub(" \\+", "",formatted_college_tab$Endowment2017Millions )
formatted_college_tab$Endowment2017Millions <- as.double(formatted_college_tab$Endowment2017Millions)
#fix College Name formatting
formatted_college_tab$CollegeName <- sub("^\n ","",formatted_college_tab$CollegeName)
formatted_college_tab$CollegeName <-sub("\n ","",formatted_college_tab$CollegeName)
#fixing Acceptance Rate formatting
formatted_college_tab$Fall2017AcceptanceRate <- sub("\n ","",formatted_college_tab$Fall2017AcceptanceRate)
formatted_college_tab$Fall2017AcceptanceRate <- sub("%","",formatted_college_tab$Fall2017AcceptanceRate)
formatted_college_tab$Fall2017AcceptanceRate <- as.double(formatted_college_tab$Fall2017AcceptanceRate)
formatted_college_tab$Fall2017AcceptanceRate <- formatted_college_tab$Fall2017AcceptanceRate/100
#fixing Grad Rate formatting
formatted_college_tab$FourYearGraduationRate <- sub("\n ","",formatted_college_tab$FourYearGraduationRate)
formatted_college_tab$FourYearGraduationRate <- sub("%","",formatted_college_tab$FourYearGraduationRate)
formatted_college_tab$FourYearGraduationRate <- as.double(formatted_college_tab$FourYearGraduationRate)
formatted_college_tab$FourYearGraduationRate <- formatted_college_tab$FourYearGraduationRate/100
#fixing Salary formatting
formatted_college_tab$MedianStartingSalaryOfAlumniThousands <-
sub("\n ","",formatted_college_tab$MedianStartingSalaryOfAlumniThousands)
formatted_college_tab$MedianStartingSalaryOfAlumniThousands <- gsub("\\*","",formatted_college_tab$MedianStartingSalaryOfAlumniThousands)
formatted_college_tab$MedianStartingSalaryOfAlumniThousands <- gsub("\\$","",formatted_college_tab$MedianStartingSalaryOfAlumniThousands)
formatted_college_tab$MedianStartingSalaryOfAlumniThousands <- gsub("\\,","",formatted_college_tab$MedianStartingSalaryOfAlumniThousands)
formatted_college_tab$MedianStartingSalaryOfAlumniThousands <- as.double(formatted_college_tab$MedianStartingSalaryOfAlumniThousands)/1000
#fixing Selectivity formatting
formatted_college_tab$Selectivity <- sub("\n ","",formatted_college_tab$Selectivity)
formatted_college_tab$Selectivity <- as.factor(formatted_college_tab$Selectivity)
#fixing Tuition formatting
formatted_college_tab$TuitionFeesThousands <- sub("\n ", "",formatted_college_tab$TuitionFeesThousands )
formatted_college_tab$TuitionFeesThousands <- sub(" \\(2018-19\\)", "",formatted_college_tab$TuitionFeesThousands )
formatted_college_tab$TuitionFeesThousands <-sub("\\,", "",formatted_college_tab$TuitionFeesThousands )
formatted_college_tab$TuitionFeesThousands <-sub("\\$", "",formatted_college_tab$TuitionFeesThousands )
formatted_college_tab$TuitionFeesThousands <- as.double(formatted_college_tab$TuitionFeesThousands)/1000
## Warning: NAs introduced by coercion
#fixing RoomBoard formatting
formatted_college_tab$RoomBoardThousands <- sub("\n ", "",formatted_college_tab$RoomBoardThousands )
formatted_college_tab$RoomBoardThousands <- sub(" \\(2018-19\\)", "",formatted_college_tab$RoomBoardThousands )
formatted_college_tab$RoomBoardThousands <-sub("\\,", "",formatted_college_tab$RoomBoardThousands )
formatted_college_tab$RoomBoardThousands <-sub("\\$", "",formatted_college_tab$RoomBoardThousands )
formatted_college_tab$RoomBoardThousands <- as.double(formatted_college_tab$RoomBoardThousands)/1000
## Warning: NAs introduced by coercion
#fixing Enrollment formatting
formatted_college_tab$TotalEnrollment <- sub("\n ", "",formatted_college_tab$TotalEnrollment )
formatted_college_tab$TotalEnrollment <-sub("\\,", "",formatted_college_tab$TotalEnrollment )
formatted_college_tab$TotalEnrollment <- as.double(formatted_college_tab$TotalEnrollment)
formatted_college_tab <- formatted_college_tab %>% mutate(TotalCostThousands =TuitionFeesThousands + RoomBoardThousands )
formatted_college_tab <- na.omit(formatted_college_tab)
nrow(formatted_college_tab)
## [1] 107
head(formatted_college_tab)
## URL
## 1 https://www.usnews.com/best-colleges/princeton-university-2627
## 2 https://www.usnews.com/best-colleges/harvard-university-2155
## 3 https://www.usnews.com/best-colleges/columbia-university-2707
## 4 https://www.usnews.com/best-colleges/massachusetts-institute-of-technology-2178
## 5 https://www.usnews.com/best-colleges/university-of-chicago-1774
## 6 https://www.usnews.com/best-colleges/yale-university-1426
## CollegeName TuitionFeesThousands
## 1 Princeton University 47.140
## 2 Harvard University 50.420
## 3 Columbia University 59.430
## 4 Massachusetts Institute of Technology 51.832
## 5 University of Chicago 57.006
## 6 Yale University 53.430
## RoomBoardThousands TotalEnrollment SchoolType YearFounded Setting
## 1 15.610 8273 Private, Coed 1746 Suburban
## 2 17.160 20604 Private, Coed 1636 Urban
## 3 14.016 25968 Private, Coed 1754 Urban
## 4 15.510 11466 Private, Coed 1861 Urban
## 5 16.350 13736 Private, Coed 1890 Urban
## 6 16.000 12974 Private, Coed 1701 City
## Endowment2017Millions MedianStartingSalaryOfAlumniThousands
## 1 23400 68.4
## 2 37100 66.5
## 3 10000 64.9
## 4 14800 79.8
## 5 6600 57.7
## 6 27200 63.2
## Selectivity Fall2017AcceptanceRate MalePercentage
## 1 Most selective 0.06 0.51
## 2 Most selective 0.05 0.52
## 3 Most selective 0.06 0.52
## 4 Most selective 0.07 0.54
## 5 Most selective 0.09 0.51
## 6 Most selective 0.07 0.50
## FourYearGraduationRate TotalCostThousands
## 1 0.89 62.750
## 2 0.84 67.580
## 3 0.88 73.446
## 4 0.85 67.342
## 5 0.88 73.356
## 6 0.87 69.430
{r} to save as csv to easily work on it without having to reload write.csv(formatted_college_tab, file = "college_info.csv") ``{r} formatted_college_tab <- read.csv(“college_info.csv”) formatted_college_tab <- formatted_college_tab[,-c(1)] formatted_college_tab ```
#Starting Salary
#-histograms
library(ggplot2)
plot_1 <- formatted_college_tab %>%
ggplot(aes(MedianStartingSalaryOfAlumniThousands)) +
geom_histogram()+
labs(title="Starting Salary Distribution", x="Median Starting Salary of Alumni (Thousands)", y="Count")
plot_1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution of the median starting salary of alumni from all the school seems to be a bell-shaped curve, centering around $55,000.
#Tuition Cost
#-histograms
library(ggplot2)
plot_2 <- formatted_college_tab %>%
ggplot(aes(TuitionFeesThousands)) +
geom_histogram()+
labs(title="Tuition Cost Distribution", x="Tuition Cost (Thousands)", y="Count")
plot_2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution of tution costs of all the schools is bimodal, with a range of $60,000.
#Acceptance rate vs graduation rate
library(ggplot2)
plot_3 <- formatted_college_tab %>%
ggplot(aes(x=Fall2017AcceptanceRate, y=FourYearGraduationRate)) +
geom_point()+
geom_smooth(method=lm)+
labs(title="Acceptance Vs. Graduation Rate", x="Fall 2017 Acceptance Rate", y="Four Year Graduation Rate")
plot_3
There is a linear relationship between acceptance rate (Fall 2017) and the four year graduation rate. It is an overall negative relationship. The more selective, the lower the rate of graduation.
#Boxplots of (1) gradrate & (2) admission rate by selectivity
library(ggplot2)
plot_4 <- formatted_college_tab %>%
ggplot(aes(x=Selectivity, y=FourYearGraduationRate)) +
geom_boxplot()+
labs(title="Graduation Rate based on Selectivity", x="Selectivity Level", y="Four Year Graduation Rate")
plot_4
This is significant difference in four year graduation rates based on their Selectivity Level of accepting students. These boxplots show that each 3 selectivity level vary significantly on range and central tendency. The more selective a college is, the greater their graduation rates seem to be.
#Setting vs. room board
library(ggplot2)
plot_5 <- formatted_college_tab %>%
ggplot(aes(x=Setting, y=RoomBoardThousands)) +
geom_boxplot()+
labs(title="Setting vs. Room & Board Costs", x="Setting", y="Room & Board Costs (Thousands)")
plot_5
The boxplots of room & board costs based on setting shows that the setting of the college does influence the room and board costs for the students. The medians vary greatly. The interquartile spreads seems to be similar while the overall ranges vary.
plot_6 <- formatted_college_tab %>%
ggplot(aes(x=TotalCostThousands, y=MedianStartingSalaryOfAlumniThousands)) +
geom_point()+
geom_smooth(method=lm)+
labs(title="Total Cost vs. Median Starting Salary", x="Total Cost (Thousand)", y="Median Starting Salary Of Alumni (Thousands)")
plot_6
There appears to be a positive linear relatinoship between median starting salary and total cost of colleges. The general trends shows that the more students spend on tution and room & board, the more likely that their starting salary is higher.
plot_7 <- formatted_college_tab %>%
ggplot(aes(x=SchoolType, y=MedianStartingSalaryOfAlumniThousands
)) +
geom_boxplot()+
labs(title="Median Starting Salary Of Alumni Based on School Type ", x="School Type", y="Median Starting Salary Of Alumni (Thousands)")
plot_7
Between school types, private colleges seem to have greater starting salaries than public schools, based on the medians of these boxplots.
formatted_college_tab %>% group_by(Selectivity) %>%
summarise(n())
## # A tibble: 3 x 2
## Selectivity `n()`
## <fct> <int>
## 1 More selective 61
## 2 Most selective 44
## 3 Selective 2
#adjusting dataset to remove variables not able to be used in model fitting
college_info <- formatted_college_tab[,-c(1,2)]
head(college_info)
## TuitionFeesThousands RoomBoardThousands TotalEnrollment SchoolType
## 1 47.140 15.610 8273 Private, Coed
## 2 50.420 17.160 20604 Private, Coed
## 3 59.430 14.016 25968 Private, Coed
## 4 51.832 15.510 11466 Private, Coed
## 5 57.006 16.350 13736 Private, Coed
## 6 53.430 16.000 12974 Private, Coed
## YearFounded Setting Endowment2017Millions
## 1 1746 Suburban 23400
## 2 1636 Urban 37100
## 3 1754 Urban 10000
## 4 1861 Urban 14800
## 5 1890 Urban 6600
## 6 1701 City 27200
## MedianStartingSalaryOfAlumniThousands Selectivity
## 1 68.4 Most selective
## 2 66.5 Most selective
## 3 64.9 Most selective
## 4 79.8 Most selective
## 5 57.7 Most selective
## 6 63.2 Most selective
## Fall2017AcceptanceRate MalePercentage FourYearGraduationRate
## 1 0.06 0.51 0.89
## 2 0.05 0.52 0.84
## 3 0.06 0.52 0.88
## 4 0.07 0.54 0.85
## 5 0.09 0.51 0.88
## 6 0.07 0.50 0.87
## TotalCostThousands
## 1 62.750
## 2 67.580
## 3 73.446
## 4 67.342
## 5 73.356
## 6 69.430
college_info$FourYearGraduationRate <- college_info$FourYearGraduationRate*100
college_info$MalePercentage <- college_info$MalePercentage*100
college_info$Fall2017AcceptanceRate <- college_info$Fall2017AcceptanceRate*100
#linear model fitting
tuition_lm_1 <- lm(TuitionFeesThousands~.-RoomBoardThousands-TotalCostThousands, data = college_info)
summary(tuition_lm_1)
##
## Call:
## lm(formula = TuitionFeesThousands ~ . - RoomBoardThousands -
## TotalCostThousands, data = college_info)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.860 -2.743 0.146 3.237 14.500
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 1.491e+01 2.646e+01 0.563
## TotalEnrollment -3.283e-05 5.921e-05 -0.555
## SchoolTypePublic, Coed -1.150e+01 1.909e+00 -6.021
## YearFounded 4.876e-03 1.305e-02 0.374
## SettingRural 2.112e-01 2.878e+00 0.073
## SettingSuburban 1.191e+00 1.695e+00 0.703
## SettingUrban 1.695e+00 1.611e+00 1.052
## Endowment2017Millions -5.211e-05 1.483e-04 -0.351
## MedianStartingSalaryOfAlumniThousands 1.396e-01 1.705e-01 0.819
## SelectivityMost selective 4.546e+00 2.168e+00 2.096
## SelectivitySelective -6.810e+00 4.740e+00 -1.437
## Fall2017AcceptanceRate 5.185e-02 5.897e-02 0.879
## MalePercentage 2.208e-02 1.319e-01 0.167
## FourYearGraduationRate 1.754e-01 4.529e-02 3.873
## Pr(>|t|)
## (Intercept) 0.5745
## TotalEnrollment 0.5806
## SchoolTypePublic, Coed 3.42e-08 ***
## YearFounded 0.7095
## SettingRural 0.9417
## SettingSuburban 0.4841
## SettingUrban 0.2957
## Endowment2017Millions 0.7262
## MedianStartingSalaryOfAlumniThousands 0.4151
## SelectivityMost selective 0.0388 *
## SelectivitySelective 0.1541
## Fall2017AcceptanceRate 0.3815
## MalePercentage 0.8674
## FourYearGraduationRate 0.0002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.182 on 93 degrees of freedom
## Multiple R-squared: 0.7003, Adjusted R-squared: 0.6584
## F-statistic: 16.72 on 13 and 93 DF, p-value: < 2.2e-16
plot(tuition_lm_1)
tuition_lm_2 <- step(tuition_lm_1, direction = "both", steps = 1000, trace = F)
summary(tuition_lm_2)
##
## Call:
## lm(formula = TuitionFeesThousands ~ SchoolType + Selectivity +
## FourYearGraduationRate, data = college_info)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.663 -2.870 0.402 3.025 14.015
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.98706 2.52629 15.037 < 2e-16 ***
## SchoolTypePublic, Coed -12.40552 1.29191 -9.602 6.18e-16 ***
## SelectivityMost selective 4.03690 1.42380 2.835 0.005521 **
## SelectivitySelective -7.47358 4.37232 -1.709 0.090437 .
## FourYearGraduationRate 0.14329 0.03637 3.940 0.000149 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.049 on 102 degrees of freedom
## Multiple R-squared: 0.6854, Adjusted R-squared: 0.6731
## F-statistic: 55.55 on 4 and 102 DF, p-value: < 2.2e-16
plot(tuition_lm_2)
anova(tuition_lm_2,tuition_lm_1, test="Chisq")
## Analysis of Variance Table
##
## Model 1: TuitionFeesThousands ~ SchoolType + Selectivity + FourYearGraduationRate
## Model 2: TuitionFeesThousands ~ (RoomBoardThousands + TotalEnrollment +
## SchoolType + YearFounded + Setting + Endowment2017Millions +
## MedianStartingSalaryOfAlumniThousands + Selectivity + Fall2017AcceptanceRate +
## MalePercentage + FourYearGraduationRate + TotalCostThousands) -
## RoomBoardThousands - TotalCostThousands
## Res.Df RSS Df Sum of Sq Pr(>Chi)
## 1 102 3731.7
## 2 93 3554.7 9 176.99 0.8653
#linear model fitting
gradrate_lm_1 <- lm(MedianStartingSalaryOfAlumniThousands~.-TuitionFeesThousands-RoomBoardThousands, data = na.omit(college_info))
summary(gradrate_lm_1)
##
## Call:
## lm(formula = MedianStartingSalaryOfAlumniThousands ~ . - TuitionFeesThousands -
## RoomBoardThousands, data = na.omit(college_info))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3062 -2.3889 -0.2445 1.8739 15.6698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.976e+01 1.595e+01 1.239 0.2186
## TotalEnrollment 8.453e-06 3.614e-05 0.234 0.8156
## SchoolTypePublic, Coed -1.430e+00 1.325e+00 -1.079 0.2833
## YearFounded 5.017e-03 7.928e-03 0.633 0.5284
## SettingRural 1.384e+00 1.741e+00 0.795 0.4286
## SettingSuburban -2.983e-01 1.032e+00 -0.289 0.7732
## SettingUrban 3.407e-01 9.871e-01 0.345 0.7308
## Endowment2017Millions 2.166e-04 8.718e-05 2.485 0.0147 *
## SelectivityMost selective 2.481e+00 1.326e+00 1.872 0.0644 .
## SelectivitySelective 2.567e+00 2.885e+00 0.890 0.3759
## Fall2017AcceptanceRate -8.235e-02 3.478e-02 -2.368 0.0200 *
## MalePercentage 5.486e-01 5.637e-02 9.732 7.55e-16 ***
## FourYearGraduationRate 1.843e-02 2.907e-02 0.634 0.5276
## TotalCostThousands 3.458e-02 5.496e-02 0.629 0.5307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.752 on 93 degrees of freedom
## Multiple R-squared: 0.7248, Adjusted R-squared: 0.6864
## F-statistic: 18.85 on 13 and 93 DF, p-value: < 2.2e-16
plot(gradrate_lm_1)
gradrate_lm_2 <- step(gradrate_lm_1, direction = "both", steps = 1000, trace = F)
summary(gradrate_lm_2)
##
## Call:
## lm(formula = MedianStartingSalaryOfAlumniThousands ~ SchoolType +
## Endowment2017Millions + Selectivity + Fall2017AcceptanceRate +
## MalePercentage, data = na.omit(college_info))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5908 -2.3830 -0.3075 1.8991 15.2461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.341e+01 2.874e+00 11.625 < 2e-16 ***
## SchoolTypePublic, Coed -1.912e+00 8.010e-01 -2.386 0.01889 *
## Endowment2017Millions 1.998e-04 7.306e-05 2.735 0.00738 **
## SelectivityMost selective 2.697e+00 1.244e+00 2.168 0.03252 *
## SelectivitySelective 2.042e+00 2.713e+00 0.752 0.45357
## Fall2017AcceptanceRate -9.100e-02 3.173e-02 -2.868 0.00504 **
## MalePercentage 5.428e-01 4.828e-02 11.243 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.672 on 100 degrees of freedom
## Multiple R-squared: 0.7166, Adjusted R-squared: 0.6996
## F-statistic: 42.15 on 6 and 100 DF, p-value: < 2.2e-16
plot(gradrate_lm_2)
anova(gradrate_lm_2,gradrate_lm_1, test="Chisq")
## Analysis of Variance Table
##
## Model 1: MedianStartingSalaryOfAlumniThousands ~ SchoolType + Endowment2017Millions +
## Selectivity + Fall2017AcceptanceRate + MalePercentage
## Model 2: MedianStartingSalaryOfAlumniThousands ~ (TuitionFeesThousands +
## RoomBoardThousands + TotalEnrollment + SchoolType + YearFounded +
## Setting + Endowment2017Millions + Selectivity + Fall2017AcceptanceRate +
## MalePercentage + FourYearGraduationRate + TotalCostThousands) -
## TuitionFeesThousands - RoomBoardThousands
## Res.Df RSS Df Sum of Sq Pr(>Chi)
## 1 100 1348.5
## 2 93 1309.3 7 39.15 0.9045